# Load required packages.
# NOTE: in the original, all six library() calls sat on one physical line,
# so everything after the first '#' was commented out and only magrittr
# actually loaded. Each call must be on its own line.
library(magrittr) # pipes
library(dplyr)    # data manipulation
library(lattice)  # conditional plotting
library(ggplot2)  # plotting
library(DAAG)     # cross-validation
library(caret)    # confusion matrices
Fundamental Techniques in Data Science with R
# Load required packages.
# NOTE: in the original, all six library() calls sat on one physical line,
# so everything after the first '#' was commented out and only magrittr
# actually loaded. Each call must be on its own line.
library(magrittr) # pipes
library(dplyr)    # data manipulation
library(lattice)  # conditional plotting
library(ggplot2)  # plotting
library(DAAG)     # cross-validation
library(caret)    # confusion matrices
With logistic regression we can model a binary outcome using a set of continuous and/or categorical predictors.
The linear probability model is a (bad) way to describe the conditional probabilities.
Because the linear probability model violates some assumptions of the linear model, the standard errors and hypothesis tests are not valid.
We will begin this lecture with a data set that records the survival of the passengers on the maiden voyage of the Titanic.
# Read the Titanic passenger data and inspect the first six rows
titanic <- readRDS("data/titanic.rds")
titanic %>% head()
## survived class name sex age ## 1 no 3rd Mr. Owen Harris Braund male 22 ## 2 yes 1st Mrs. John Bradley (Florence Briggs Thayer) Cumings female 38 ## 3 yes 3rd Miss. Laina Heikkinen female 26 ## 4 yes 1st Mrs. Jacques Heath (Lily May Peel) Futrelle female 35 ## 5 no 3rd Mr. William Henry Allen male 35 ## 6 no 3rd Mr. James Moran male 27 ## siblings_spouses parents_children fare ## 1 1 0 7.2500 ## 2 1 0 71.2833 ## 3 0 0 7.9250 ## 4 1 0 53.1000 ## 5 0 0 8.0500 ## 6 0 0 8.4583
# Inspect the structure: 887 rows, 8 variables (factors, character, numeric)
str(titanic)
## 'data.frame': 887 obs. of 8 variables: ## $ survived : Factor w/ 2 levels "no","yes": 1 2 2 2 1 1 1 1 2 2 ... ## $ class : Factor w/ 3 levels "1st","2nd","3rd": 3 1 3 1 3 3 1 3 3 2 ... ## $ name : chr "Mr. Owen Harris Braund" "Mrs. John Bradley (Florence Briggs Thayer) Cumings" "Miss. Laina Heikkinen" "Mrs. Jacques Heath (Lily May Peel) Futrelle" ... ## $ sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ... ## $ age : num 22 38 26 35 35 27 54 2 27 14 ... ## $ siblings_spouses: int 1 1 0 1 0 0 0 3 0 1 ... ## $ parents_children: int 0 0 0 0 0 0 0 1 2 0 ... ## $ fare : num 7.25 71.28 7.92 53.1 8.05 ...
The outcome/dependent variable:
Some potential predictors:
We can investigate patterns in these data that relate to the survival probability.
Based on the creed “women and children first”, we could hypothesize that
age relates to the probability of survival \(\rightarrow\) younger passengers have a higher probability of survival; sex relates to the probability of survival \(\rightarrow\) females have a higher probability of survival. Based on socioeconomic status, we could hypothesize that
class relates to the probability of survival \(\rightarrow\) passengers traveling in higher classes have a higher probability of survival. Add a numeric version of the outcome:
# Add a 0/1 dummy for survival: the factor levels are c("no", "yes"),
# so as.numeric() gives 1/2 and subtracting 1 yields 0 = no, 1 = yes.
titanic <- titanic %>%
  mutate(survived_dummy = as.numeric(survived) - 1)
Split the data
# Split the data into a training set (n = 800) and a test set.
# NOTE: the original fused several statements onto one line and named the
# selection vector 'filter', which masks dplyr::filter(); 'keep' avoids both.
set.seed(235711)
keep  <- sample(c(rep(TRUE, 800), rep(FALSE, nrow(titanic) - 800)))
train <- titanic[keep, ]
test  <- titanic[!keep, ]
# Is age related to survival? Scatter the 0/1 outcome against age and
# overlay a fitted logistic regression curve (no confidence band).
ggplot(train, aes(x = age, y = survived_dummy)) +
  geom_point() +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = FALSE)
# Cross-tabulate passenger class against survival in the training data
train %$% table(class, survived)
## survived ## class no yes ## 1st 71 126 ## 2nd 89 77 ## 3rd 335 102
Higher classes seem to have higher probabilities of survival.
# Row-wise proportions (margin = 1 conditions on class), rounded to 2 decimals
train %$% table(class, survived) %>% prop.table(margin = 1) %>% round(2)
## survived ## class no yes ## 1st 0.36 0.64 ## 2nd 0.54 0.46 ## 3rd 0.77 0.23
# Lattice histograms of age, conditioned on survival status
train %$% histogram(~ age | survived)
The distribution of age for survivors is different from the distribution of age for non-survivors.
# Simple logistic regression of survival on age alone
glm(survived ~ age, data = train, family = "binomial") %>% summary()
## ## Call: ## glm(formula = survived ~ age, family = "binomial", data = train) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.0585 -0.9942 -0.9455 1.3692 1.5482 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -0.279353 0.167250 -1.670 0.0949 . ## age -0.007001 0.005175 -1.353 0.1761 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 1063.5 on 799 degrees of freedom ## Residual deviance: 1061.6 on 798 degrees of freedom ## AIC: 1065.6 ## ## Number of Fisher Scoring iterations: 4
# Lattice histograms of sex, conditioned on survival status
train %$% histogram(~ sex | survived)
These distributions are very different!
# Simple logistic regression of survival on sex alone
glm(survived ~ sex, data = train, family = "binomial") %>% summary()
## ## Call: ## glm(formula = survived ~ sex, family = "binomial", data = train) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.6336 -0.6470 -0.6470 0.7818 1.8259 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 1.0287 0.1354 7.595 3.08e-14 *** ## sexmale -2.4863 0.1759 -14.139 < 2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 1063.48 on 799 degrees of freedom ## Residual deviance: 826.93 on 798 degrees of freedom ## AIC: 830.93 ## ## Number of Fisher Scoring iterations: 4
# Lattice histograms of class, conditioned on survival status
train %$% histogram(~ class | survived)
There is an obvious difference between the distributions of survivors and non-survivors over the classes.
# Simple logistic regression of survival on class alone ("1st" is the reference)
glm(survived ~ class, data = train, family = "binomial") %>% summary()
## ## Call: ## glm(formula = survived ~ class, family = "binomial", data = train) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.4286 -0.7291 -0.7291 0.9454 1.7059 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 0.5736 0.1484 3.865 0.000111 *** ## class2nd -0.7184 0.2150 -3.341 0.000835 *** ## class3rd -1.7628 0.1866 -9.448 < 2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 1063.5 on 799 degrees of freedom ## Residual deviance: 961.7 on 797 degrees of freedom ## AIC: 967.7 ## ## Number of Fisher Scoring iterations: 4
# Additive model: age, sex, and class as predictors.
# NOTE: the original fused both statements onto one line.
fit1 <- glm(survived ~ age + sex + class, data = train, family = "binomial")
# partSummary() is a course-supplied helper (not base R) — presumably it
# prints the summary minus the part indicated by -1; confirm against its source.
partSummary(fit1, -1)
## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.6703 -0.6683 -0.4046 0.6090 2.4710 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 3.604399 0.386547 9.325 < 2e-16 ## age -0.033911 0.007477 -4.535 5.75e-06 ## sexmale -2.582049 0.197999 -13.041 < 2e-16 ## class2nd -1.158872 0.272027 -4.260 2.04e-05 ## class3rd -2.500839 0.265632 -9.415 < 2e-16 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 1063.48 on 799 degrees of freedom ## Residual deviance: 717.29 on 795 degrees of freedom ## AIC: 727.29 ## ## Number of Fisher Scoring iterations: 5
# Model with all two-way interactions among age, sex, and class
fit2 <- glm(survived ~ age * sex + age * class + sex * class,
data = train,
family = "binomial")
# Show only the coefficient table from the summary
summary(fit2)$coefficients
## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 3.406236060 0.96589624 3.52650307 0.0004210863 ## age -0.001442227 0.02182564 -0.06607946 0.9473145633 ## sexmale -2.573984452 0.90134635 -2.85571075 0.0042940613 ## class2nd 1.265990912 1.24535856 1.01656740 0.3093592708 ## class3rd -3.228320737 0.92019259 -3.50830986 0.0004509635 ## age:sexmale -0.032651977 0.01826990 -1.78720022 0.0739051334 ## age:class2nd -0.062583155 0.02677554 -2.33732529 0.0194222767 ## age:class3rd -0.010038308 0.01981602 -0.50657548 0.6124527145 ## sexmale:class2nd -1.286498437 0.91743119 -1.40228331 0.1608306630 ## sexmale:class3rd 1.581979595 0.71402979 2.21556526 0.0267212900
Test the change in deviance.
# Likelihood-ratio (change-in-deviance) test for the nested models fit1 vs fit2
anova(fit1, fit2, test = "Chisq")
## Analysis of Deviance Table ## ## Model 1: survived ~ age + sex + class ## Model 2: survived ~ age * sex + age * class + sex * class ## Resid. Df Resid. Dev Df Deviance Pr(>Chi) ## 1 795 717.29 ## 2 790 677.76 5 39.525 1.862e-07 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Compare information criteria
# Compare models by AIC (lower is better)
AIC(fit1, fit2)
## df AIC ## fit1 5 727.2894 ## fit2 10 697.7644
# Compare models by BIC (lower is better; penalizes parameters more than AIC)
BIC(fit1, fit2)
## df BIC ## fit1 5 750.7124 ## fit2 10 744.6105
# 10-fold cross-validated accuracy for the additive model.
# NOTE: the original fused both statements onto one line.
set.seed(235711)
cv1 <- CVbinary(fit1, nfolds = 10)
## ## Fold: 10 5 7 9 4 2 8 3 6 1 ## Internal estimate of accuracy = 0.792 ## Cross-validation estimate of accuracy = 0.791
# 10-fold cross-validated accuracy for the interaction model
# (same seed, so the fold assignment matches cv1).
set.seed(235711)
cv2 <- CVbinary(fit2, nfolds = 10)
## ## Fold: 10 5 7 9 4 2 8 3 6 1 ## Internal estimate of accuracy = 0.795 ## Cross-validation estimate of accuracy = 0.789
# Compare the cross-validated accuracy estimates of the two models
c(fit1 = cv1$acc.cv, fit2 = cv2$acc.cv)
## fit1 fit2 ## 0.79125 0.78875
To estimate a logistic regression model, we need to use maximum likelihood (ML) estimation.
Conceptually, ML estimation amounts to the following:
Let’s say we have the following \(N = 10\) observations.
# Draw N = 10 values from N(5, 1); the outer parentheses print the assignment.
# NOTE(review): no set.seed() precedes this, so the printed output below
# will not reproduce exactly on re-run.
(y <- rnorm(n = 10, mean = 5, sd = 1))
## [1] 6.682539 4.000324 4.644473 4.565293 4.649484 5.692962 5.061735 4.320619 ## [9] 3.975904 5.330619
In ML estimation, we would define different normal distributions.
We then compare the observed data to those distributions and see which distribution best fits the data.
We can state the assumptions of linear regression as follows:
Unlike linear regression, we don’t need to assume
For computational reasons, we also need the following:
Recall the two ways we can visualize our model’s predictions.
The \(logit()\) function linearizes the success probability.
The mean of the binomial distribution is the success probability: \(\bar{Y} = \pi\).
The variance is a function of the mean: \(\textrm{var}(Y) = \pi (1 - \pi)\).
There are many ways to define a residual in logistic regression.
## Compute three flavors of residual (response, Pearson, deviance) and
## the fitted logit values, stack them into long format, and plot each
## residual type against the linear predictor with a smoother.
resTypes <- c(Response = "response", Pearson = "pearson", Deviance = "deviance")
eta <- predict(fit2, type = "link")
rDat <- data.frame(
  Residual = unlist(lapply(resTypes, function(tp) resid(fit2, type = tp)),
                    use.names = FALSE),
  Eta      = rep(eta, times = 3),
  Type     = rep(names(resTypes), each = length(eta))
)
ggplot(rDat, aes(x = Eta, y = Residual, color = Type)) +
  geom_point(alpha = 0.35) +
  geom_smooth(se = FALSE)
## Built-in regression diagnostic plots (stats::plot.lm panel numbers):
plot(fit2, 1) # Pearson residuals
plot(fit2, 4) # panel 4: Cook's distance per observation
plot(fit2, 5) # panel 5: residuals vs. leverage (flags influential cases)
# Predicted logits (with standard errors) for the test set.
# NOTE: the original fused both statements onto one line.
etaHat <- predict(fit2, newdata = test, type = "link", se.fit = TRUE)
sapply(etaHat, head)
## $fit ## 6 25 26 28 32 39 ## -2.00566733 0.08607105 -0.25834498 0.18446174 3.33700917 -0.02873430 ## ## $se.fit ## 6 25 26 28 32 39 ## 0.1839848 0.2629204 0.2770250 0.3590614 0.6510206 0.1873309 ## ## $residual.scale ## [1] 1
Calculate \(\hat{\pi}\) directly using the predict() function:
# Predicted survival probabilities for the test set, directly via type = "response"
piHat <- predict(fit2, newdata = test, type = "response")
Apply the logistic function, plogis(), to the \(\hat{\eta}\) values we computed earlier:
# Equivalently, push the predicted logits through the logistic CDF;
# both routes give identical probabilities (compared side by side below).
# NOTE: the original fused both statements onto one line.
piHat2 <- plogis(etaHat$fit)
head(cbind(piHat, piHat2))
## piHat piHat2 ## 6 0.1186092 0.1186092 ## 25 0.5215045 0.5215045 ## 26 0.4357706 0.4357706 ## 28 0.5459851 0.5459851 ## 32 0.9656768 0.9656768 ## 39 0.4928169 0.4928169
# Convert predicted probabilities into class labels with a 0.5 threshold.
# NOTE: the original fused both statements onto one line.
yHat <- ifelse(piHat > 0.5, "yes", "no") %>% factor()
table(yHat)
## yHat ## no yes ## 66 21
Calculate the confusion matrix
# Confusion matrix: predicted labels (rows) vs. observed labels (columns).
# NOTE: the original fused both statements onto one line.
cm <- table(pred = yHat, true = test$survived)
cm
## true ## pred no yes ## no 47 19 ## yes 3 18
# Sensitivity: proportion of actual survivors ("yes") correctly predicted
(sensitivity <- cm["yes", "yes"] / sum(cm[ , "yes"]))
## [1] 0.4864865
# Specificity: proportion of actual non-survivors ("no") correctly predicted
(specificity <- cm["no", "no"] / sum(cm[ , "no"]))
## [1] 0.94
# Accuracy: sum of the diagonal (correct predictions) over all predictions
(acc <- diag(cm) %>% sum() / sum(cm))
## [1] 0.7471264
We can also use the caret::confusionMatrix() function:
# NOTE: caret treats the FIRST factor level ("no") as the 'positive' class
# by default, so the printed Sensitivity/Specificity are swapped relative to
# the manual calculations above (pass positive = "yes" to change this).
confusionMatrix(yHat, reference = test$survived)
## Confusion Matrix and Statistics ## ## Reference ## Prediction no yes ## no 47 19 ## yes 3 18 ## ## Accuracy : 0.7471 ## 95% CI : (0.6425, 0.8342) ## No Information Rate : 0.5747 ## P-Value [Acc > NIR] : 0.0006273 ## ## Kappa : 0.4519 ## ## Mcnemar's Test P-Value : 0.0013838 ## ## Sensitivity : 0.9400 ## Specificity : 0.4865 ## Pos Pred Value : 0.7121 ## Neg Pred Value : 0.8571 ## Prevalence : 0.5747 ## Detection Rate : 0.5402 ## Detection Prevalence : 0.7586 ## Balanced Accuracy : 0.7132 ## ## 'Positive' Class : no ##
First, we’ll add the predicted quantities to the training data.
# Add the fitted logits, their standard errors, and the fitted probabilities
# (for fit2) to the training data.
# NOTE: the original fused all four statements onto one line.
tmp <- predict(fit2, type = "link", se.fit = TRUE)
train$eta <- tmp$fit
train$se  <- tmp$se.fit
train$pi  <- plogis(tmp$fit)
Next, we add confidence intervals for the fitted values.
# Add 95% Wald confidence intervals on the logit scale, then map them to
# the probability scale with the logistic CDF.
train <- train %>%
  mutate(etaLower = eta - 1.96 * se,
         etaUpper = eta + 1.96 * se,
         piLower  = plogis(etaLower),
         piUpper  = plogis(etaUpper))
# Fitted logits by age, with 95% CI ribbons, colored by class, faceted by sex
ggplot(train, aes(x = age, y = eta)) +
geom_line(aes(color = class), lwd = 1) +
geom_ribbon(aes(ymin = etaLower, ymax = etaUpper, fill = class), alpha = 0.2) +
ylab("Logit of Survival") +
facet_wrap(vars(sex))
# Same plot on the probability scale
ggplot(train, aes(x = age, y = pi)) +
geom_ribbon(aes(ymin = piLower, ymax = piUpper, fill = class), alpha = 0.2) +
geom_line(aes(color = class), lwd = 1) +
ylab("Probability of Survival") +
facet_wrap(vars(sex))
Augment the data with fitted values from the additive model:
# Augment the training data with fitted values from the additive model (fit1).
# Spell out 'se.fit': the original's 'se = TRUE' relied on partial argument
# matching, which is fragile and flagged by lintr.
tmp <- predict(fit1, type = "link", se.fit = TRUE)
train$eta <- tmp$fit
train$se  <- tmp$se.fit
train$pi  <- plogis(tmp$fit)
# Add 95% Wald confidence intervals on the logit scale, then map them to
# the probability scale with the logistic CDF.
train <- train %>%
  mutate(etaLower = eta - 1.96 * se,
         etaUpper = eta + 1.96 * se,
         piLower  = plogis(etaLower),
         piUpper  = plogis(etaUpper))
# Fitted logits for the additive model: straight, parallel lines within sex
ggplot(train, aes(x = age, y = eta)) +
geom_line(aes(color = class), lwd = 1) +
geom_ribbon(aes(ymin = etaLower, ymax = etaUpper, fill = class), alpha = 0.2) +
ylab("Logit of Survival") +
facet_wrap(vars(sex))
# Same plot on the probability scale
ggplot(train, aes(x = age, y = pi)) +
geom_ribbon(aes(ymin = piLower, ymax = piUpper, fill = class), alpha = 0.2) +
geom_line(aes(color = class), lwd = 1) +
ylab("Probability of Survival") +
facet_wrap(vars(sex))